
** Session setup: close any open log, empty Stata memory, fix the
** random-number seed, and permanently turn off output paging
capture log close
clear all
set seed 1234
set more off, perm

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Table O.35: Normalized Phase 2 Scores, Omissions = Predicted IRT Score //////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

*************************************************************************************************************************************
********************************  Calculating  Predicted IRT Score ******************************************************************
*************************************************************************************************************************************

** Load the Phase 2 long-format dataset
use "Work Data/Gender_Phase2_long.dta", clear

* --- Main sample ---
* 1) Keep only years before the affirmative action took place
drop if aa_year == 1
tab year

* --- Item scores by question order ---
* For each of the 12 question positions, copy the score of the row's own
* subject into score_item#, blanking it when the item is flagged as missing.
forvalues q = 1/12 {
    gen score_item`q' = .
    levelsof subject, local(subj_codes)
    foreach code of local subj_codes {
        replace score_item`q' = `code'`q'_st2 if subject == "`code'"
        replace score_item`q' = . if missing_p2_`code'`q' == 1 & subject == "`code'"
    }
    // Scores are doubled because the IRT command does not allow noninteger values
    replace score_item`q' = 2 * score_item`q'
    tab subject, sum(score_item`q')
}

* Keep the item scores plus identifiers and save the IRT input file
keep score_item1-score_item12 subject year sex female inscri2
save "Work Data/IRT_Data.dta", replace

**********************************************************************************
****************** Predicted scores***********************************************
**********************************************************************************

// For each year 2001-2004 and each of the eight subjects: reload the IRT input
// file, keep that year-subject cell, fit a graded response model on the item
// scores, and save one file of predicted item scores per cell.
// NOTE(review): missings is a user-written command, presumably installed from SSC - confirm.
forvalues j=1(1)4 {
foreach i in hist biol geog math phy chem port lang {	
use "Work Data/IRT_Data.dta", clear
keep if year==200`j' & subject=="`i'"
// Item 6 is blanked for physics in 2003; capture lets the script continue if the replace errors
capture replace score_item6=. if year==2003 & subject=="phy"
// Presumably drops variables that are missing for every observation in this cell - confirm against the missings help file
missings dropvars, force
irt grm score_item1-score_item12
// Latent ability estimate; it is not kept in the saved file below
predict ability_`i', latent
forvalues z=1(1)12 {
// Predicted category probabilities for item z; capture noisily shows any error and carries on
capture noisily predict c_it`i'`z'_* if subject=="`i'", outcome(score_item`z')
forvalues k=1(1)11 {
// Fill in any of the 11 category variables that predict did not create with zero.
// For variables that already exist gen errors out; capture noisily reports it and moves on.
capture noisily gen c_it`i'`z'_`k'=0
}
// Expected item score: category k is weighted by (k-1)/2, so category 1 contributes zero.
// NOTE(review): this assumes category k corresponds to a doubled score of k-1, that is, every
// score level from 0 to 5 in half-point steps is observed for the item - confirm.
gen pred_`i'_`z'=c_it`i'`z'_2*.5+c_it`i'`z'_3+c_it`i'`z'_4*1.5+c_it`i'`z'_5*2+c_it`i'`z'_6*2.5+c_it`i'`z'_7*3+c_it`i'`z'_8*3.5+c_it`i'`z'_9*4+c_it`i'`z'_10*4.5+c_it`i'`z'_11*5 if subject=="`i'"
}
// Keep only the predicted scores and the identifiers
keep pred_* subject year inscri2
save "Work Data/Predicted_Scores_200`j'_`i'.dta", replace
clear
}
}

**********************************************************************************
***************** Generating Dataset of Predicted Scores *************************
**********************************************************************************

// Build one wide file of predicted scores per year (2001-2004) by merging the
// eight subject-specific files on applicant and year, then stack the years
// into a single file.
forvalues j = 1/4 {
    use "Work Data/Predicted_Scores_200`j'_biol.dta", clear
    // subject is dropped from the biology file; the using files still carry
    // their own subject variable, so it is reintroduced by the merges below
    drop subject
    foreach i in hist geog math phy chem port lang {
        merge 1:1 inscri2 year using "Work Data/Predicted_Scores_200`j'_`i'.dta"
        drop _merge
    }
    // Save once per year, after every subject has been merged in
    save "Work Data/Predicted_Scores_200`j'_all.dta", replace
    // NOTE(review): the pause presumably gives a synced or network folder time to release the file - confirm
    sleep 5000
}

// Stack the yearly files. Start from empty memory and loop over the same
// years that were written above (2001-2004). The earlier range 0 to 3
// requested a 2000 file that this script never creates and silently relied
// on the 2004 data still being in memory.
clear
forvalues j = 1/4 {
    append using "Work Data/Predicted_Scores_200`j'_all.dta"
    sleep 5000
}
save "Work Data/Predicted_Score_all.dta", replace


*************************************************************************************************************************************
******************************** Regressions replacing omissions with predicted scores  ******************************************
*************************************************************************************************************************************

** Load the Phase 2 long-format dataset
use "Work Data/Gender_Phase2_long.dta", clear

*** Creating variables: numeric subject code and one dummy per subject
encode subject, gen(sub)
tab subject, gen(d_sub)
label var sub "Subject"

** Subject dummies
* tab, gen() numbers the dummies in the sort order of the subject codes.
* NOTE(review): the names below assume that order is
* biol chem geog hist lang math phy port - confirm against the data.
rename d_sub1 Biology
rename d_sub2 Chemistry
rename d_sub3 Geography
rename d_sub4 History
rename d_sub5 Language
rename d_sub6 Mathematics
rename d_sub7 Physics
rename d_sub8 Portuguese

* Labels (full variable names are used so this does not depend on
* variable abbreviation being switched on)
label var Biology "Biology"
label var Chemistry "Chemistry"
label var Geography "Geography"
label var History "History"
label var Mathematics "Mathematics"
label var Physics "Physics"
label var Portuguese "Portuguese"
label var Language "Foreign Language"

** Female x priority interaction
gen fem_priority = female * priority
label var fem_priority "Female $\times$ Priority"

** Subject-specific interactions with female, priority, and female x priority
foreach subj of varlist Biology-Portuguese {
    gen fem_`subj' = `subj' * female
    gen prio_`subj' = priority * `subj'
    gen fem_prio_`subj' = fem_priority * `subj'
    label var fem_`subj' "Female $\times$ `subj'"
    label var prio_`subj' "Priority $\times$ `subj'"
    label var fem_prio_`subj' "Female $\times$ Priority $\times$ `subj'"
}

** Regressor lists: subject dummies and their female interactions
** (Biology is not listed in either global)
global subject "Chemistry Geography History Mathematics Physics"
global subject_fem "fem_Chemistry fem_Geography fem_History fem_Mathematics fem_Physics"

** Powers 2 to 4 of the normalized Phase 1 subject-specific score
forvalues p = 2/4 {
    gen norm_p1score`p' = norm_p1score^`p'
    sum norm_p1score`p'
}

*********************************************************************************
****************   Relative performances ****************************************
*********************************************************************************

** ENEM: score relative to the mean of the applicant's year-by-gender group
bys year female: egen norm_enem_w_ave_g = mean(norm_enem_w)
gen norm_enem_w_g = norm_enem_w - norm_enem_w_ave_g
bys year female: sum norm_enem_w_g
drop norm_enem_w_ave_g

* Subject x ENEM interactions (also with female), each with powers 2 to 4
foreach subj of varlist Biology-Portuguese {
    gen enem_`subj' = `subj' * norm_enem_w_g
    label var enem_`subj' "ENEM $\times$ `subj'"
    gen fem_enem_`subj' = female * norm_enem_w_g * `subj'
    label var fem_enem_`subj' "Female $\times$ ENEM $\times$ `subj'"
    forvalues p = 2/4 {
        gen enem_`subj'_`p' = enem_`subj'^`p'
        gen fem_enem_`subj'_`p' = fem_enem_`subj'^`p'
    }
    sum enem_`subj'* fem_enem_`subj'*
}

* Wildcard lists of the ENEM polynomial terms for the five listed subjects
global g_pol_enem_sub "enem_Chemistry* enem_Geography* enem_History* enem_Mathematics* enem_Physics*"
describe $g_pol_enem_sub
global g_pol_fem_enem_sub "fem_enem_Chemistry* fem_enem_Geography* fem_enem_History* fem_enem_Mathematics* fem_enem_Physics*"
describe $g_pol_fem_enem_sub

* Priority x relative ENEM performance, linear term and powers 2 to 4
gen norm_enem_w_priority_g = norm_enem_w_g * priority
forvalues p = 2/4 {
    gen norm_enem_w_priority_g`p' = norm_enem_w_g^`p' * priority
    sum norm_enem_w_priority_g`p'
}

global g_norm_enem_w_prio norm_enem_w_priority_g*
describe $g_norm_enem_w_prio

** Phase 1 scores: deviation from the year-subject-gender mean, its powers
** 2 to 4, and the interactions of all four terms with priority
foreach score in norm_p1score {

    tab year, sum(`score')
    bys year subject female: egen gs_`score'_ave = mean(`score')
    gen gs_`score' = `score' - gs_`score'_ave
    bys year female subject: sum gs_`score'
    drop gs_`score'_ave

    forvalues p = 2/4 {
        gen gs_`score'`p' = gs_`score'^`p'
        sum gs_`score'`p'
    }

    // Fourth-order polynomial in the relative Phase 1 score
    global gs_pol_`score' gs_`score' gs_`score'2 gs_`score'3 gs_`score'4
    d $gs_pol_`score'

    // Priority x Phase 1 scores
    gen gs_`score'_prio = gs_`score' * priority
    forvalues p = 2/4 {
        gen gs_`score'_prio`p' = gs_`score'`p' * priority
        sum gs_`score'_prio*
    }
}

global gs_pol_norm_p1score_prio gs_norm_p1score_prio*
d $gs_pol_norm_p1score_prio

*********************************************************************************
**************** Main sample ****************************************************
*********************************************************************************

* 1) Keep only years before the affirmative action took place, and drop 2000
drop if aa_year == 1
drop if year == 2000
tab year
* Omissions cannot be told apart from zero scores in 2003 and 2004,
* so only years up to 2002 are kept
drop if year > 2002
tab year

* 2) Drop Portuguese and Foreign Language: Phase 1 has no Portuguese or
*    Foreign Language exam (for Portuguese, Phase 1 has an essay)
tab subject, sum(norm_p1score)
drop if inlist(subject, "lang", "port")
tab subject, sum(norm_p1score)
drop Language Portuguese prio_Language prio_Portuguese fem_prio_Language fem_prio_Portuguese

*********************************************************************************
**************** Score replacing omission ***************************************
*********************************************************************************

* Attach the predicted item scores to every row of the same applicant.
* NOTE(review): the join key is inscri2 only and unmatched rows from both
* sides are kept; presumably inscri2 is unique across years - confirm.
joinby inscri2 using "Work Data/Predicted_Score_all.dta", unmatched(both) _merge(merge)

sum pred*

* Item scores for the row's own subject, with the predicted score
* substituted wherever the item is flagged as missing
forvalues q = 1/12 {
    gen score_item`q' = .
    levelsof subject, local(subj_codes)
    foreach code of local subj_codes {
        replace score_item`q' = `code'`q'_st2 if subject == "`code'"
        replace score_item`q' = pred_`code'_`q' if missing_p2_`code'`q' == 1 & subject == "`code'"
    }
    tab subject, sum(score_item`q')
}

* Phase 2 total with omissions replaced: row sum over the item scores
describe score_item*
egen pred_score_P2 = rowtotal(score_item*)
sum pred_score_P2 score

*** Standardize the total within year and subject
bys year subject: sum pred_score_P2
bys year subject: egen mean_pscore = mean(pred_score_P2)
bys year subject: egen sd_pscore = sd(pred_score_P2)
gen norm_pscore = (pred_score_P2 - mean_pscore) / sd_pscore
bys year subject: sum norm_pscore

*********************************************************************************
****************   Regressions **************************************************
*********************************************************************************


estimates clear

* Shared pieces: clustering on the applicant, applicant fixed effects, and
* the regressors used in the two OLS columns
local se_opt cluster(inscri2)
local fe_opt cluster(inscri2) absorb(inscri2)
local ols_rhs female priority fem_priority norm_enem_w_g

* (1) OLS, no subject controls
reg norm_pscore `ols_rhs', `se_opt'
estimates store reg1

* (2) OLS adding subject dummies and subject x female
d $subject $subject_fem
reg norm_pscore `ols_rhs' $subject $subject_fem, `se_opt'
estimates store reg2

* (3) Applicant fixed effects; controls accumulate from here on
local fe_rhs priority fem_priority $subject $subject_fem
reghdfe norm_pscore `fe_rhs', `fe_opt'
estimates store reg3

* (4) adds the subject x ENEM polynomial terms
d $g_pol_enem_sub
local fe_rhs `fe_rhs' $g_pol_enem_sub
reghdfe norm_pscore `fe_rhs', `fe_opt'
estimates store reg4

* (5) adds the priority x ENEM polynomial terms
d $g_norm_enem_w_prio
local fe_rhs `fe_rhs' $g_norm_enem_w_prio
reghdfe norm_pscore `fe_rhs', `fe_opt'
estimates store reg5

* (6) adds the relative Phase 1 score polynomial
d $gs_pol_norm_p1score
local fe_rhs `fe_rhs' $gs_pol_norm_p1score
reghdfe norm_pscore `fe_rhs', `fe_opt'
estimates store reg6

* (7) adds the priority x Phase 1 score polynomial
d $gs_pol_norm_p1score_prio
local fe_rhs `fe_rhs' $gs_pol_norm_p1score_prio
reghdfe norm_pscore `fe_rhs', `fe_opt'
estimates store reg7

* Yes/No indicator rows for the table footer. Each row name is paired with
* the number of the first model that gets "Yes"; earlier models get "No".
local fe_rows   sub_fe subgender_fe ind_fe enemsub enemprio_pol4 p1score_pol4 p1scoreprio_pol4
local first_yes 2 2 3 4 5 6 7
local n_rows : word count `fe_rows'

forvalues r = 1/`n_rows' {
    local row   : word `r' of `fe_rows'
    local start : word `r' of `first_yes'

    // Split reg1 to reg7 into the models without and with this control
    local no_models
    local yes_models
    forvalues m = 1/7 {
        if `m' < `start' {
            local no_models `no_models' reg`m'
        }
        else {
            local yes_models `yes_models' reg`m'
        }
    }

    estadd local `row' "No": `no_models'
    estadd local `row' "Yes": `yes_models'
}

* Tex: write the seven stored regressions to a booktabs LaTeX fragment with
* standard errors and significance stars at the 10, 5 and 1 percent levels.
* Only the four coefficients listed in keep() are shown; the stats() rows add
* the adjusted R-squared, the observation and cluster counts, and the Yes/No
* indicator rows attached with estadd.
* NOTE(review): sep is not set by any estadd call visible in this file; presumably
* it is a deliberate blank spacer row (its label is a single space) - confirm.
* NOTE(review): fmt() lists twelve formats for eleven statistics - confirm the extra one is harmless.
esttab reg1 reg2 reg3 reg4 reg5 reg6 reg7 using "Output/p_IRT_predictedscorep2.tex", se star(* 0.10 ** 0.05 *** 0.01) nogap ///
stats(r2_a N N_clust sep sub_fe subgender_fe ind_fe enemsub  enemprio_pol4 p1score_pol4 p1scoreprio_pol4 , fmt(%9.3fc %9.0fc %9.0fc %1s %3s %3s %3s %3s %3s %3s %3s %3s) /// 
labels("$\bar{R}^2$" "Number of observations"  "Number of applicants" " " "Subject FE" "Subject-gender FE" "Individual FE" "ENEM $\times$ Subject FE" "ENEM $\times$ Priority" "Phase 1 scores" "Phase 1 scores $\times$ Priority"  )) b(%7.3f) se(%7.3f)  booktabs replace f label nomtitle collabels(none) keep(female priority fem_priority norm_enem_w_g) ///
refcat(female " \\ \multicolumn{8}{l}{\textit{Dependent variable: Phase 2 normalized subject-specific scores}} \\", nolabel)
